#########
## Nature Human Behaviour 
## Leading Countries in Global Science Increasingly Receive More Citations than Other Countries Despite Doing Similar Research.
## https://doi.org/10.1038/s41562-022-01351-5
## Harvard Dataverse (Code and Metadata): https://doi.org/10.7910/DVN/WCOINR 
## Regression Tables in Supplemental Information
## Data: Data_20210905
#########

#.rs.restartR()
gc(reset=T)
rm(list = ls(all.names = TRUE))


#### Libraries -----------
library("dplyr")
library("ggplot2")
library("scales")
library("texreg")
library("lme4")

#### Set up Parameters -----------
minimum_year <- 1980
maximum_year <- 2017
Percentile_Cutoff_Criterion_ <- 0
Citation_Window_Criterion_ <- 5
Inflation_Criterion_ <- "Field_Deflated"
Journal_Censoring_Criterion_ <- "Since_1980"

#### Functions ---------
Grand_SE <- function(x){return(sqrt(sum(x^2,na.rm=T)/length(x)^2))} #https://stats.stackexchange.com/questions/21104/calculate-average-of-a-set-numbers-with-reported-standard-errors

mean_center <- function(x){return(( (x-mean(x,na.rm=T))/sd(x,na.rm=T)))}

standard_error <- function(x){return(sd(x,na.rm=T)/sqrt(length(x)))}

##### MAG MetaData Information ------------
MAG_Field_ID_df <- read.csv("INPUT_MAG_FieldIDs_Domain.csv") %>%
  dplyr::mutate(Domain = case_when(Domain=="Medicine" ~ "Biomedical, Behavioral,\nand Ecological Sciences",
                                   Domain=="Engineering and Computer Science" ~ "Engineering and\nComputational Sciences",
                                   Domain=="Social Sciences" ~ "Social\nSciences",
                                   Domain=="Life, Behavioral, and Ecological Sciences" ~ "Biomedical, Behavioral,\nand Ecological Sciences",
                                   Domain=="Physical and Mathematical Sciences" ~ "Physical and\nMathematical Sciences",
                                   TRUE ~ as.character(Domain))) %>%
  as.data.frame()
# Read in Files ------------------------------

Country_df <- read.csv("OUTPUT_R_CINC_RandD_UniRanking.csv") %>%
  dplyr::mutate(Country_Name = case_when(Country_Name=="USA"~"UNITED_STATES",
                                         Country_Name=="UK"~"UNITED_KINGDOM",
                                         Country_Name=="PEOPLES_R_CHINA"~"CHINA",
                                         TRUE ~ as.character(Country_Name)))%>%
  dplyr::mutate(Country = stringr::str_to_title(stringr::str_replace(as.character(Country_Name),"_"," "))) %>%
  as.data.frame()

# Read in Data ---------------------------------

Delta_Degree_Centrality_df <- plyr::ldply(list.files(pattern="OUTPUT_Python_Field_Delta_Degree_Centrality_KLD_Citation_Corpus_RAKE_and_GoogleAPI_EnglishOnly_*",
                                                     full.names=T), 
                                          read.csv) %>%  
  dplyr::mutate(Country = stringr::str_to_title(stringr::str_replace(as.character(Country),"_"," "))) %>%
  as.data.frame() %>%
  merge(.,Country_df, 
        by=c("Country","Year"),
        all.x=T) %>%
  subset(Year>=1980) %>%
  mutate(Top_50_Universities = if_else(is.na(Top_50_Universities), 0, Top_50_Universities)) %>%
  mutate(RandD = if_else(is.na(RandD), 0, RandD)) %>%
  dplyr::group_by(Discipline,Citation_Type,Percentile_Cutoff,Journal_Censoring) %>%
  dplyr::mutate(RandD_Scale = scale(RandD),
                Top_50_Universities_Scale = scale(Top_50_Universities),
                Number_of_Papers_Scale = scale(Number_of_Papers),
                InDelta_Citation_ji_MeanCentered = InDelta_Citation_ji-mean(InDelta_Citation_ji,na.rm=T)) %>%
  as.data.frame()
  

KLD_Degree_Centrality_df <- plyr::ldply(list.files(pattern="OUTPUT_Python_Field_Degree_Centrality_KLD_Citation_Corpus_RAKE_and_GoogleAPI_EnglishOnly_*",
                                                   full.names=T), 
                                        read.csv)  %>%
  dplyr::mutate(Country = stringr::str_to_title(stringr::str_replace(as.character(Country),"_"," "))) %>%
  as.data.frame() %>%
  merge(.,Country_df, 
        by=c("Country","Year"),
        all.x=T) %>%
  subset(Year>=1980) %>%
  mutate(Top_50_Universities = if_else(is.na(Top_50_Universities), 0, Top_50_Universities)) %>%
  mutate(RandD = if_else(is.na(RandD), 0, RandD)) %>%
  dplyr::group_by(Discipline,Citation_Type,Percentile_Cutoff,Journal_Censoring) %>%
  dplyr::mutate(RandD_Scale = scale(RandD),
                Top_50_Universities_Scale = scale(Top_50_Universities),
                Number_of_Papers_Scale = scale(Number_of_Papers),
                KLD_InDegree_Centrality_MeanCentered = KLD_InDegree_Centrality-mean(KLD_InDegree_Centrality,na.rm=T)) %>%
  as.data.frame()

# Regression Model | Delta ------------------------------

Delta_model_list_grand <- list()

Delta_model_list_grand[[1]] <- lmer(InDelta_Citation_ij~
                            + (1|Discipline/Country), 
                         data = Delta_Degree_Centrality_df %>%
                           subset(Year>=1980) %>%
                           subset(Citation_Type=="Field_Deflated" & 
                                    Percentile_Cutoff==0&
                                    Journal_Censoring=="Since_1980"))

Delta_model_list_grand[[2]] <- lmer(InDelta_Citation_ij~
                                Top_50_Universities_Scale + (1|Discipline/Country), 
                              data = Delta_Degree_Centrality_df %>%
                                subset(Year>=1980) %>%
                                subset(Citation_Type=="Field_Deflated" & 
                                         Percentile_Cutoff==0&
                                         Journal_Censoring=="Since_1980"))

Delta_model_list_grand[[3]] <- lmer(InDelta_Citation_ij~
                                RandD_Scale + (1|Discipline/Country), 
                              data = Delta_Degree_Centrality_df %>%
                                subset(Year>=1980) %>%
                                subset(Citation_Type=="Field_Deflated" & 
                                         Percentile_Cutoff==0&
                                         Journal_Censoring=="Since_1980")) 

Delta_model_list_grand[[4]] <- lmer(InDelta_Citation_ij~
                                Number_of_Papers_Scale + (1|Discipline/Country), 
                              data = Delta_Degree_Centrality_df %>%
                                subset(Year>=1980) %>%
                                subset(Citation_Type=="Field_Deflated" & 
                                         Percentile_Cutoff==0&
                                         Journal_Censoring=="Since_1980"))

Delta_model_list_grand[[5]] <- lmer(InDelta_Citation_ij~
                                Number_of_Papers_Scale + 
                                Top_50_Universities_Scale+
                                RandD_Scale+
                                (1|Discipline/Country), 
                              data = Delta_Degree_Centrality_df %>%
                                subset(Year>=1980) %>%
                                subset(Citation_Type=="Field_Deflated" & 
                                         Percentile_Cutoff==0&
                                         Journal_Censoring=="Since_1980"))

screenreg(Delta_model_list_grand,
          custom.coef.names = c("(Intercept)","Top 50 Universities","R&D Percentage","Number of Papers"))

# Regression Model | Delta ------------------------------

KLD_model_list_grand <- list()

KLD_model_list_grand[[1]] <- lmer(KLD_InDegree_Centrality~
                                + (1|Discipline/Country), 
                              data = KLD_Degree_Centrality_df %>%
                                subset(Year>=1980) %>%
                                subset(Citation_Type=="Field_Deflated" & 
                                         Percentile_Cutoff==0&
                                         Journal_Censoring=="Since_1980"))

KLD_model_list_grand[[2]] <- lmer(KLD_InDegree_Centrality~
                                Top_50_Universities_Scale + (1|Discipline/Country), 
                              data = KLD_Degree_Centrality_df %>%
                                subset(Year>=1980) %>%
                                subset(Citation_Type=="Field_Deflated" & 
                                         Percentile_Cutoff==0&
                                         Journal_Censoring=="Since_1980"))

KLD_model_list_grand[[3]] <- lmer(KLD_InDegree_Centrality~
                                RandD_Scale + (1|Discipline/Country), 
                              data = KLD_Degree_Centrality_df %>%
                                subset(Year>=1980) %>%
                                subset(Citation_Type=="Field_Deflated" & 
                                         Percentile_Cutoff==0&
                                         Journal_Censoring=="Since_1980")) 

KLD_model_list_grand[[4]] <- lmer(KLD_InDegree_Centrality~
                                Number_of_Papers_Scale + (1|Discipline/Country), 
                              data = KLD_Degree_Centrality_df %>%
                                subset(Year>=1980) %>%
                                subset(Citation_Type=="Field_Deflated" & 
                                         Percentile_Cutoff==0&
                                         Journal_Censoring=="Since_1980"))

KLD_model_list_grand[[5]] <- lmer(KLD_InDegree_Centrality~
                                Number_of_Papers_Scale + 
                                Top_50_Universities_Scale+
                                RandD_Scale+
                                (1|Discipline/Country), 
                              data = KLD_Degree_Centrality_df %>%
                                subset(Year>=1980) %>%
                                subset(Citation_Type=="Field_Deflated" & 
                                         Percentile_Cutoff==0&
                                         Journal_Censoring=="Since_1980"))

screenreg(KLD_model_list_grand,
          custom.coef.names = c("(Intercept)","Top 50 Universities","R&D Percentage","Number of Papers"))

# Output -------------------------

htmlreg(Delta_model_list_grand,
       custom.coef.names = c("(Intercept)","Top 50 Universities","R&D Percentage","Number of Papers"),
       file = "SM_Table_R_DV_Delta_HLM.html")

htmlreg(KLD_model_list_grand,
        custom.coef.names = c("(Intercept)","Top 50 Universities","R&D Percentage","Number of Papers"),
        file = "SM_Table_R_DV_KLD_HLM.html")

